import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from matplotlib import cm
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist
# ---- Load and prepare the credit-card customer dataset ----
data = pd.read_excel("Credit Card Customer Data.xlsx")
data.head()

# Some customer keys occur more than once; inspect the repeated rows.
data[data.duplicated(subset=['Customer Key'])]

# Remove the duplicated customers, keeping the first occurrence of each key.
data.drop_duplicates(subset=['Customer Key'], inplace=True)

# Structural overview of the cleaned dataset.
data.info()
data.dtypes
data.shape

# Summary statistics showing how values are distributed per column.
data.describe()

# Confirm there is no missing data.
data.isna().sum()

# Examine the correlation between dimensions (Kendall's tau).
data.corr(method='kendall')

# A pair plot can hint at how many clusters the data might contain.
sns.pairplot(data, diag_kind='kde')

# Identifier columns carry no clustering signal; drop them.
data = data.drop(['Sl_No', 'Customer Key'], axis=1)

# Standardize every feature to zero mean and unit variance.
data = data.apply(zscore)
data.head()
# ---- Cluster building: choose k with the elbow method ----
# Fit K-means for k = 1..9 and record the inertia (within-cluster SSE).
cluster_range = range(1, 10)
cluster_errors = []
for num_clusters in cluster_range:
    # Keyword args for clarity; random_state fixed so the elbow curve is
    # reproducible and consistent with the final model fitted below.
    clusters = KMeans(n_clusters=num_clusters, n_init=5, random_state=12345)
    clusters.fit(data)
    cluster_errors.append(clusters.inertia_)
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df[0:15]

# Plot inertia vs. k to determine the number of clusters: the "elbow"
# marks the point of diminishing returns.
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
# Elbow point is at k = 4; fit the final K-means model.
kmeans = KMeans(n_clusters=4, n_init=5, random_state=12345)
kmeans.fit(data)

# Calculate and display the silhouette score for K-means clustering.
score_kmeans = silhouette_score(data, kmeans.labels_, metric='euclidean')
score_kmeans

# Check the number of data points in each cluster.
labels = kmeans.labels_
counts = np.bincount(labels[labels >= 0])
print(counts)

# Cluster centers (in z-score space), one row per cluster.
centroids = kmeans.cluster_centers_
centroid_df = pd.DataFrame(centroids, columns=list(data))
centroid_df.transpose()

# Attach the cluster assignment to the data.  Assign once, then cast to
# 'category'.  (The original assigned the raw integer labels a second
# time AFTER the cast, which silently reverted the dtype to int.)
predictions = kmeans.predict(data)
predictions
data["group"] = predictions
data['group'] = data['group'].astype('category')

# Visualize each feature's distribution per cluster.
data.boxplot(by='group', layout=(3, 4), figsize=(15, 10))
# ---- Hierarchical clustering ----
# Remove the 'group' column added by K-means so only features remain.
data_hier = data.drop(['group'], axis=1)

# Agglomerative clustering with average linkage on Euclidean distances.
hier_model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='average')
hier_model.fit(data_hier)

# Compute the linkage matrix and cophenetic correlation on the features
# ONLY, before attaching the cluster labels.  (The original appended the
# 'labels' column first, so the cluster assignment itself leaked into
# the pairwise distances used by linkage/cophenet.)
Z = linkage(data_hier, metric='euclidean', method='average')
c, coph_dists = cophenet(Z, pdist(data_hier))
c

# Number of customers assigned to each hierarchical cluster.
data_hier['labels'] = hier_model.labels_
data_hier.head(10)
data_hier.groupby(["labels"]).count()
# View the dendrogram using average linkage.
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram using Average linkage')
dendrogram(Z, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()

# Features only: exclude the 'labels' column so the cluster assignment
# does not leak into the distance computations below.  (The original
# ran linkage/pdist on data_hier *including* the labels column.)
features_hier = data_hier.drop(columns=['labels'], errors='ignore')

# Repeat the clustering with complete linkage.
Z = linkage(features_hier, metric='euclidean', method='complete')
c, coph_dists = cophenet(Z, pdist(features_hier))
c
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram using Complete linkage')
dendrogram(Z, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()

# Repeat the clustering with Ward linkage.
Z = linkage(features_hier, metric='euclidean', method='ward')
c, coph_dists = cophenet(Z, pdist(features_hier))
c
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram using Ward linkage')
dendrogram(Z, leaf_rotation=90., color_threshold=40, leaf_font_size=8.)
plt.tight_layout()

# Box plots of each feature per hierarchical cluster.
data_hier.boxplot(by='labels', layout=(3, 4), figsize=(15, 10))
# Calculate and display the silhouette score for hierarchical clustering.
# Two fixes vs. the original: score the HIERARCHICAL labels (the original
# mistakenly passed kmeans.labels_, so it never measured the hierarchical
# model), and score them on the features only (the original included the
# 'labels' column itself in the distance space).
score_hier = silhouette_score(data_hier.drop(columns=['labels'], errors='ignore'),
                              hier_model.labels_, metric='euclidean')
score_hier

# Compare the silhouette scores of the two clustering techniques.
print(f"Silhouette Score\nKmeans:{score_kmeans}\nHierarchical Clustering:{score_hier}")
#Key Questions:
# How many different segments of customers are there?
#Answer: There are 3 types of customers
# How are these segments different from each other?
#Answer: Group 1 has: Low credit limit, possesses 1-2 credit cards, low calls made, high visits to the branch, low online visits
#Group 2 has: Very high credit limit, more than 1 credit card, low calls made, low visit to the branch, very high online visits
#Group 3: Very low credit limit, 0 credit cards, highest number of calls made among the groups, some customers prefer to visit the branch, moderate use of online visits
# What are your recommendations to the bank on how to better market to and service these customers?
#Answer: For Group 1: Increase credit limit so as to get them spending more, encourage more online interaction
#For Group 2: Upsell more offers for adding a new credit cards
#For Group 3: Try to have them onboard for having a credit card, incentivise the usage of credit card.